In this documents the IPO Prospectus PDF-files will be read into text data, processable in Python. Afterwards it will be processed, cleaned and evaluated.¶


Load required packages...
In [4]:
import os, re
import pandas as pd
import numpy as np
import bs4
from pdfminer.high_level import extract_text
import tqdm
import gc
In [309]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
In [5]:
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['tok2vec','tagger','attribute_ruler','lemmatizer','ner','parser'])
nlp.enable_pipe("senter")

DATA GENERATION: Create a dictionary of all file names in folder

In [307]:
path = "C:\\NLP\\Prospectus\\Risk Faktors\\"
os.chdir(path)

Text_data = {'Issuer/Borrower PermID': [] , 'content': [] }   

DATA GENERATION: Store text into dictionary:

In [308]:
for file in tqdm.tqdm(os.listdir()):

    if file.endswith(".pdf"):
        try:
            text = extract_text(path + file)
            text = u' '.join(text.split())
            Text_data['content'].append(text)
            Text_data['Issuer/Borrower PermID'].append(file.split('.', 1)[0])
            del text
            del soup
            gc.collect()

        except:
            pass


    if file.endswith(".htm") or file.endswith(".html"):
        try:
            with open(path + file ,"r",encoding="utf-8") as f:
                text = f.read()
            soup=bs4.BeautifulSoup(text)
            text = u' '.join(soup.get_text().split())
            Text_data['content'].append(text)
            Text_data['Issuer/Borrower PermID'].append(file.split('.', 1)[0])
            del text
            del soup
            gc.collect()

        except:
            pass
    
    else:
        continue
100%|██████████| 222/222 [22:39<00:00,  6.13s/it]  
In [309]:
csv_save = pd.DataFrame(Text_data)
csv_save.to_csv('Text_data.csv', encoding='utf-8',index=False)

Further processing of text:

Split into sentences and count number of words:

In [15]:
Text_data = pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Text_data.csv')
In [310]:
nlp.max_length = 6500000
In [317]:
processed_text = { 'Issuer/Borrower PermID': [] , 'content': [] }
processed_text['Issuer/Borrower PermID'] = Text_data['Issuer/Borrower PermID']

Clean the text:

In [318]:
for i in tqdm.tqdm(range(len(Text_data['content']))):
    if pd.isnull(Text_data['content'][i]):
        processed_text['content'].append('nan')

    else:
        processed_text['content'].append(nlp(

            re.sub(r'\d+\.?\,?\d*\s{1}\d+\.?\,?\d*',"",                                                                        ## Leer zwischen Dezimalzahlen entfernen
            re.sub(r'(\. )([a-z])',r' \2', re.sub(r'\.{2,}',' ', re.sub(r'\d\.\s+|\b[a-z]\)\s+|•\s+|●\s+|\[[^\]]*\]|○\s+|[A-Z]\.\s+|[IVX]+\.\s+/g', ". " , re.sub(r"\S*https?:\S*", "",
            re.sub("([\u4e00-\u9fa5])+","", Text_data['content'][i])).replace(' . ', '.'))).replace(';','.')).replace('-'," ").replace("  ", " ").replace(". , ",", ")).replace('(', '').replace(')', '').replace('“', '')\
                                                   .replace('”', '').replace('‘', '').replace('’', '').replace('`', '').replace('− ', '').replace('- ', '').replace('_', '').replace('  ', ' ')\
                                                   .replace('cid','').replace('\\', '')
        ))
100%|██████████| 221/221 [00:32<00:00,  6.88it/s]
In [320]:
csv_save_processed = pd.DataFrame(processed_text)
csv_save_processed.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', encoding='utf-8',index=False)

START HERE WHEN USING ALREADY CLEANED EXCEL DATA

In [257]:
csv_save_processed = pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', converters={'content': lambda x: str(x)})
In [258]:
NLP_results =  pd.DataFrame()
NLP_results['filename'] = csv_save_processed['filename']
In [259]:
NLP_results.rename(columns={'filename':'Issuer/Borrower PermID'}, inplace=True)
csv_save_processed.rename(columns={'filename':'Issuer/Borrower PermID'}, inplace=True)

Add the number of word as a variable

In [260]:
words=[]
for i in range(len(csv_save_processed['content'])):
    words.append(len(csv_save_processed['content'][i]))

NLP_results['words'] = words

del words

Devide the whole text into sentences

In [224]:
nlp.max_length = 6500000

sentences = []

for i in tqdm.tqdm(range(len(csv_save_processed['content']))):
    sentences.append([sent.text.strip() for sent in nlp(csv_save_processed['content'][i]).sents])
100%|██████████| 221/221 [00:38<00:00,  5.68it/s]
In [261]:
csv_save_processed['sentences']=sentences

Count the number of sentences

In [262]:
no_sent=[]
for i in range(len(sentences)):
    no_sent.append(len(sentences[i]))

NLP_results['sentences'] = no_sent

del no_sent
In [94]:
#csv_save_processed.to_csv('Processed_Text.csv', encoding='utf-8',index=False)
In [85]:
#csv_save_processed=pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', converters={'content': lambda x: str(x)})

Start Natural-Language-Processing¶

Load the list of COVID-Related Terms found on the internet and provided by UVA Health

In [227]:
cov_terms = pd.read_excel(r'H:\IPO Sentiment\Code\Data\Covid_Related_Terms (1).xlsx', skiprows=[0,1], converters={'TERM': lambda x: str(x)})
In [ ]:
csv_save_processed['content'][12] = np.nan
csv_save_processed['content'][217] = np.nan

Count the amount of COVID-related words

In [264]:
cov=[]
add=0
for i in  tqdm.tqdm(range(len(csv_save_processed['content']))):

    if pd.isnull(csv_save_processed['content'][i]):
        senti= np.nan
        cov.append(senti)

    else:
        for k in range(len(cov_terms)):
            count= len(re.findall(r"\b" + re.escape(cov_terms['TERM'][k].lower()) + r"\b", csv_save_processed['content'][i].lower()))  #csv_save_processed['content'][i].lower().count(" " + cov_terms['TERM'][k].lower() + " ")
            add = add + count
        senti = add/NLP_results['words'][i]
        cov.append(senti)

NLP_results['pandemic words'] = cov
100%|██████████| 221/221 [22:41<00:00,  6.16s/it]
In [331]:
#NLP_results.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results_pandemic_words.csv', encoding='utf-8',index=False)

The Loughran McDonald Dictionary is a well implemented dictionary in NLP listing a range of words reflecting e.g. uncertainty, complexity or constraints. Those words can be counted as well.

In [230]:
L_Mc_dict = pd.read_excel('H:\\IPO Sentiment\\Code\\Data\\LoughranMcDonald_MasterDictionary_2020.xlsx')
L_Mc_dict['Word']=L_Mc_dict['Word'].astype(str) 
In [265]:
negative = L_Mc_dict[L_Mc_dict["Negative"]!=0]['Word'].str.lower().tolist()
positive = L_Mc_dict[L_Mc_dict["Positive"]!=0]['Word'].str.lower().tolist()

uncertainty =  L_Mc_dict[L_Mc_dict["Uncertainty"]!=0]['Word'].str.lower().tolist()
litigious = L_Mc_dict[L_Mc_dict["Litigious"]!=0]['Word'].str.lower().tolist()
strong_modal = L_Mc_dict[L_Mc_dict["Strong_Modal"]!=0]['Word'].str.lower().tolist()
weak_modal = L_Mc_dict[L_Mc_dict["Weak_Modal"]!=0]['Word'].str.lower().tolist()
constraining = L_Mc_dict[L_Mc_dict["Constraining"]!=0]['Word'].str.lower().tolist()
complexity = L_Mc_dict[L_Mc_dict["Complexity"]!=0]['Word'].str.lower().tolist()
negate = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", "ain't", "aren't", "can't",
          "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt",
          "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't",
          "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "wasnt",
          "werent", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
          "wouldn't", "rarely", "seldom", "despite", "no", "nobody"]
In [266]:
senti_dict = {'Negative': negative , 'Positive': positive , 'Uncertainty': uncertainty, 'Constraining': constraining}

How many percent of uncertain words are part of negative words?¶

In [267]:
def negated(word):
   
    if word.lower() in negate:
        return True
    else:
        return False
In [268]:
def tone_count_with_negation_check(dict, article):
    """
    Count positive and negative words with negation check. Account for simple negation only for positive words.
    Simple negation is taken to be observations of one of negate words occurring within three words
    preceding a positive words.
    """
    pos_count = 0
    neg_count = 0
 
    pos_words = []
    neg_words = []
 
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())
 
    word_count = len(input_words)
 
    for i in range(0, word_count):
        if input_words[i] in dict['Negative']:
            neg_count += 1
            neg_words.append(input_words[i])
        if input_words[i] in dict['Positive']:
            if i >= 3:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]) or negated(input_words[i - 3]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 2:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 1:
                if negated(input_words[i - 1]):
                    neg_count += 1
                    neg_words.append(input_words[i] + ' (with negation)')
                else:
                    pos_count += 1
                    pos_words.append(input_words[i])
            elif i == 0:
                pos_count += 1
                pos_words.append(input_words[i])
 
    #print('The results with negation check:', end='\n\n')
    #print('The # of positive words:', pos_count)
    #print('The # of negative words:', neg_count)
    #print('The list of found positive words:', pos_words)
    #print('The list of found negative words:', neg_words)
    #print('\n', end='')
 
    results = [word_count, pos_count, neg_count, pos_words, neg_words]
 
    return results
In [269]:
def uncert_constrain_count_with_negation_check(dict, article):
    """
    Count positive and negative words with negation check. Account for simple negation only for positive words.
    Simple negation is taken to be observations of one of negate words occurring within three words
    preceding a positive words.
    """
    const_count = 0
    uncert_count = 0
 
    const_words = []
    uncert_words = []
 
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())
 
    word_count = len(input_words)
 
    for i in range(0, word_count):
        if input_words[i] in dict['Uncertainty']:
            if i >= 3:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]) or negated(input_words[i - 3]):
                    continue
                else:
                    uncert_count += 1
                    uncert_words.append(input_words[i])
            elif i == 2:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]):
                    continue
                else:
                    uncert_count += 1
                    uncert_words.append(input_words[i])
            elif i == 1:
                if negated(input_words[i - 1]):
                    continue
                else:
                    uncert_count += 1
                    uncert_words.append(input_words[i])
            elif i == 0:
                uncert_count += 1
                uncert_words.append(input_words[i])

        if input_words[i] in dict['Constraining']:
            if i >= 3:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]) or negated(input_words[i - 3]):
                    continue
                else:
                    const_count += 1
                    const_words.append(input_words[i])
            elif i == 2:
                if negated(input_words[i - 1]) or negated(input_words[i - 2]):
                    continue
                else:
                    const_count += 1
                    const_words.append(input_words[i])
            elif i == 1:
                if negated(input_words[i - 1]):
                    continue
                else:
                    const_count += 1
                    const_words.append(input_words[i])
            elif i == 0:
                const_count += 1
                const_words.append(input_words[i])
 
    #print('The results with negation check:', end='\n\n')
    #print('The # of positive words:', pos_count)
    #print('The # of negative words:', neg_count)
    #print('The list of found positive words:', pos_words)
    #print('The list of found negative words:', neg_words)
    #print('\n', end='')
 
    results = [word_count, uncert_count, const_count, uncert_words, const_words]
 
    return results
In [270]:
csv_save_processed.drop([12,217], inplace=True)
NLP_results.drop([12,217], inplace=True)
In [271]:
csv_save_processed.reset_index(inplace=True)
NLP_results.reset_index(inplace=True)
In [272]:
senti_res = []

for i in tqdm.tqdm(range(len(csv_save_processed['content']))):
    
    senti_res.append(tone_count_with_negation_check(senti_dict, csv_save_processed['content'][i]))
100%|██████████| 219/219 [04:07<00:00,  1.13s/it]
In [273]:
uncert_const_res = []

for i in tqdm.tqdm(range(len(csv_save_processed['content']))):
    
    uncert_const_res.append(uncert_constrain_count_with_negation_check(senti_dict, csv_save_processed['content'][i]))
100%|██████████| 219/219 [00:36<00:00,  6.03it/s]
In [274]:
NLP_results['positive'] = pd.DataFrame( senti_res[0:], columns=['words','pos','neg','pos_words','neg_words'])['pos']
NLP_results['negative'] = pd.DataFrame( senti_res[0:], columns=['words','pos','neg','pos_words','neg_words'])['neg']
NLP_results['uncertainty'] = pd.DataFrame( uncert_const_res[0:], columns=['words','uncertain','constraining','uncert_words','const_words'])['uncertain']
NLP_results['constraining'] = pd.DataFrame( uncert_const_res[0:], columns=['words','uncertain','constraining','uncert_words','const_words'])['constraining']
In [275]:
NLP_results['sentiment'] = (NLP_results['positive'] - NLP_results['negative']) / NLP_results['words']
NLP_results['positive'] = NLP_results['positive'] / NLP_results['words']
NLP_results['negative'] = NLP_results['negative'] / NLP_results['words']
NLP_results['uncertainty'] = NLP_results['uncertainty'] / NLP_results['words']
NLP_results['constraining'] = NLP_results['constraining'] / NLP_results['words']
In [345]:
#NLP_results.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results.csv', encoding='utf-8',index=False)
In [83]:
#NLP_results=pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results.csv')

Visualizse COVID measure and Sentiment

In [22]:
IPOs = pd.read_excel("H:\\IPO Sentiment\\Code\\Data\\IPOs since 2017.xlsx", converters={'CIK': lambda x: str(x)})
In [276]:
NLP_results = NLP_results.merge(IPOs[['Issuer/Borrower PermID', 'Issuer/Borrower Nation']], on='Issuer/Borrower PermID')
In [242]:
import matplotlib.pyplot as plt
In [307]:
NLP_results['Issuer/Borrower Nation'].unique()
Out[307]:
array(['Germany', 'India', 'Norway', 'Hong Kong', 'Singapore', 'Spain',
       'United Kingdom', 'United Arab Emirates', 'Bahrain',
       'China (Mainland)', 'France', 'United States', 'Finland',
       'Denmark', 'Guernsey', 'Portugal', 'Cambodia', 'Netherlands',
       'Turkey', 'Sweden', 'Saudi Arabia', 'South Korea', 'Austria',
       'Russia', 'Iceland', 'Ireland', 'Bangladesh', 'Malaysia',
       'Australia', 'Luxembourg', 'Qatar', 'Chile', 'Canada',
       'Cayman Islands', 'Jersey', 'Switzerland'], dtype=object)
In [279]:
region = []

for i in range(len(NLP_results)):
    if NLP_results['Issuer/Borrower Nation'][i] == 'South Africa' or NLP_results['Issuer/Borrower Nation'][i] == 'Ghana' or NLP_results['Issuer/Borrower Nation'][i] == 'Tanzania' or NLP_results['Issuer/Borrower Nation'][i] == 'Togo'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Ivory Coast' or NLP_results['Issuer/Borrower Nation'][i] == 'Nigeria' or NLP_results['Issuer/Borrower Nation'][i] == 'Morocco' or NLP_results['Issuer/Borrower Nation'][i] == 'Mozambique'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Namibia' or NLP_results['Issuer/Borrower Nation'][i] == 'Uganda' or NLP_results['Issuer/Borrower Nation'][i] == 'Malawi' or NLP_results['Issuer/Borrower Nation'][i] == 'Tunisia'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Zambia' or NLP_results['Issuer/Borrower Nation'][i] == 'Rwanda' or NLP_results['Issuer/Borrower Nation'][i] == 'Mauritius':
        region.append('Africa')
    
    elif NLP_results['Issuer/Borrower Nation'][i] == 'Japan' or NLP_results['Issuer/Borrower Nation'][i] == 'China (Mainland)' or NLP_results['Issuer/Borrower Nation'][i] == 'Hong Kong' or NLP_results['Issuer/Borrower Nation'][i] == 'South Korea'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'India' or NLP_results['Issuer/Borrower Nation'][i] == 'Thailand' or NLP_results['Issuer/Borrower Nation'][i] == 'Singapore' or NLP_results['Issuer/Borrower Nation'][i] == 'Indonesia'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Vietnam' or NLP_results['Issuer/Borrower Nation'][i] == 'Philippines' or NLP_results['Issuer/Borrower Nation'][i] == 'Malaysia' or NLP_results['Issuer/Borrower Nation'][i] == 'Taiwan'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Maldives' or NLP_results['Issuer/Borrower Nation'][i] == 'Cambodia' or NLP_results['Issuer/Borrower Nation'][i] == 'Nepal' or NLP_results['Issuer/Borrower Nation'][i] == 'Bangladesh'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Macau' or NLP_results['Issuer/Borrower Nation'][i] == 'Sri Lanka' or NLP_results['Issuer/Borrower Nation'][i] == 'Pakistan' or NLP_results['Issuer/Borrower Nation'][i] == 'Mongolia'\
    or NLP_results['Issuer/Borrower Nation'][i] == 'Laos' or NLP_results['Issuer/Borrower Nation'][i] == 'Myanmar' or NLP_results['Issuer/Borrower Nation'][i] == 'Kazakhstan':
        region.append('Asia')
    
    elif NLP_results['Issuer/Borrower Nation'][i] == 'Germany' or NLP_results['Issuer/Borrower Nation'][i] == 'Poland' or NLP_results['Issuer/Borrower Nation'][i] == 'Netherlands' or NLP_results['Issuer/Borrower Nation'][i] == 'Italy'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Luxembourg' or NLP_results['Issuer/Borrower Nation'][i] == 'United Kingdom' or NLP_results['Issuer/Borrower Nation'][i] == 'Switzerland' or NLP_results['Issuer/Borrower Nation'][i] == 'Sweden'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Norway' or NLP_results['Issuer/Borrower Nation'][i] == 'France' or NLP_results['Issuer/Borrower Nation'][i] == 'Austria' or NLP_results['Issuer/Borrower Nation'][i] == 'Belgium'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Spain' or NLP_results['Issuer/Borrower Nation'][i] == 'Russia' or NLP_results['Issuer/Borrower Nation'][i] == 'Finland' or NLP_results['Issuer/Borrower Nation'][i] == 'Turkey'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Slovenia' or NLP_results['Issuer/Borrower Nation'][i] == 'Ireland' or NLP_results['Issuer/Borrower Nation'][i] == 'Denmark' or NLP_results['Issuer/Borrower Nation'][i] == 'Lithuania'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Iceland' or NLP_results['Issuer/Borrower Nation'][i] == 'Cyprus' or NLP_results['Issuer/Borrower Nation'][i] == 'Isle of Man' or NLP_results['Issuer/Borrower Nation'][i] == 'Estonia'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Portugal' or NLP_results['Issuer/Borrower Nation'][i] == 'Liechtenstein' or NLP_results['Issuer/Borrower Nation'][i] == 'Hungary' or NLP_results['Issuer/Borrower Nation'][i] == 'Romania'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Malta' or NLP_results['Issuer/Borrower Nation'][i] == 'Bulgaria' or NLP_results['Issuer/Borrower Nation'][i] == 'Czech Republic' or NLP_results['Issuer/Borrower Nation'][i] == 'Croatia'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Latvia' or NLP_results['Issuer/Borrower Nation'][i] == 'Serbia' or NLP_results['Issuer/Borrower Nation'][i] == 'Jersey':
            region.append('Europe')

    elif NLP_results['Issuer/Borrower Nation'][i] == 'Saudi Arabia' or NLP_results['Issuer/Borrower Nation'][i] == 'Iran' or NLP_results['Issuer/Borrower Nation'][i] == 'United Arab Emirates' or NLP_results['Issuer/Borrower Nation'][i] == 'Kuwait'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'India' or NLP_results['Issuer/Borrower Nation'][i] == 'Qatar' or NLP_results['Issuer/Borrower Nation'][i] == 'Oman' or NLP_results['Issuer/Borrower Nation'][i] == 'Bahrain'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Jordan' or NLP_results['Issuer/Borrower Nation'][i] == 'Israel' :
            region.append('Middle East')

    elif NLP_results['Issuer/Borrower Nation'][i] == 'United States' or NLP_results['Issuer/Borrower Nation'][i] == 'Canada' or NLP_results['Issuer/Borrower Nation'][i] == 'Mexico' or NLP_results['Issuer/Borrower Nation'][i] == 'Bahamas'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Bermuda' or NLP_results['Issuer/Borrower Nation'][i] == 'British Virgin Islands':
            region.append('North America')

    elif NLP_results['Issuer/Borrower Nation'][i] == 'Chile' or NLP_results['Issuer/Borrower Nation'][i] == 'Argentina' or NLP_results['Issuer/Borrower Nation'][i] == 'Peru' or NLP_results['Issuer/Borrower Nation'][i] == 'Brazil'\
        or NLP_results['Issuer/Borrower Nation'][i] == 'Bermuda' or NLP_results['Issuer/Borrower Nation'][i] == 'British Virgin Islands':
            region.append('South America') 
    
    else:
        region.append('Rest') 
In [280]:
NLP_results['region'] = region

Divide Dataset into pre- and post COVID by IPO Date¶

In [281]:
NLP_results = NLP_results.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
In [282]:
NLP_results['Issue Date'] = NLP_results['Issue Date'].apply(lambda x: x.date())
In [316]:
split_date = pd.datetime(2020,1,1).date()

results_pre = NLP_results.loc[NLP_results['Issue Date'] <= split_date]
results_post = NLP_results.loc[NLP_results['Issue Date'] > split_date]
In [363]:
import matplotlib.gridspec as gridspec
import datetime
In [446]:
plot_data_name= ['constraining','uncertainty','sentiment','finbert sentiment']
plot_lines= [[0.00125,0.00125],[0.0055,0.0055],[-0.004,-0.004],[-0.35,-0.35]]
In [448]:
percent_calc_pre = [
    sum(results_pre['constraining']>=plot_lines[0][0])/len(results_pre['constraining']), 
    sum(results_pre['uncertainty']>=plot_lines[1][0])/len(results_pre['uncertainty']), 
    sum(results_pre['sentiment']>=plot_lines[2][0])/len(results_pre['sentiment']),
    sum(results_pre['finbert sentiment']>=plot_lines[3][0])/len(results_pre['finbert sentiment'])
]

percent_calc_post = [
    sum(results_post['constraining']>=plot_lines[0][0])/len(results_post['constraining']),
    sum(results_post['uncertainty']>=plot_lines[1][0])/len(results_post['uncertainty']),
    sum(results_post['sentiment']>=plot_lines[2][0])/len(results_post['sentiment']),
    sum(results_post['finbert sentiment']>=plot_lines[3][0])/len(results_post['finbert sentiment'])
]
In [536]:
time_trends_post = pd.DataFrame(columns=["Date",'uncertainty','constraining','sentiment','finbert sentiment'])
time_trends_post['Date'] = np.sort(results_post['Issue Date'].unique())
In [540]:
for j in range(len(plot_data_name)):
    trends = []
    for i in range(len(np.sort(results_post['Issue Date'].unique()))):
        trends.append(sum(results_post[results_post['Issue Date'] == np.sort(results_post['Issue Date'].unique())[i]][plot_data_name[j]])/len(results_post[results_post['Issue Date'] == np.sort(results_post['Issue Date'].unique())[i]][plot_data_name[j]]))
    time_trends_post[plot_data_name[j]] = trends
In [504]:
rows = 4
cols = 1

fig = plt.figure(figsize=(20, 25), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)

for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #  '\n' is important
    fake.set_title(['Constraining Word Ratio','Uncertainty Word Ratio', 'LMD Sentiment Ratio','Finbert Sentiment Ratio'][i], fontweight='semibold', size=16)
    fake.set_axis_off()

    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=grid[i], wspace=0.05)

    # real subplot #1
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Pre-COVID','Pre-COVID','Pre-COVID', 'Pre-COVID'][i])
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=False)
    ax.scatter(results_pre['Issue Date'], results_pre[plot_data_name[i]], s =35, c = 'black')
    #ax.plot([min(results_pre['Issue Date']), max(results_pre['Issue Date'])], plot_lines[i])
    plt.axhspan(plot_lines[i][0], [0.0021,0.0068,-0.00045,-0.15][i], facecolor='0.2', alpha=0.2)
    plt.text(min(results_pre['Issue Date']), [0.0019,0.0064,-0.0011,-0.19][i], '{0:.0%}'.format(percent_calc_pre[i]) + ' of the Data', color="blue", fontsize=16)
    plt.ylim(min(results_post[plot_data_name[i]])-[0.0001,0.0002,0.0005,0.02][i], max(results_post[plot_data_name[i]])+[0.0002,0.0004,0.0003,0.04][i])
    plt.ylabel(['Constraining Word Ratio','Uncertainty Word Ratio', 'LMD Sentiment Ratio','Finbert Sentiment Ratio'][i])

    ax = fig.add_subplot(gs[2])
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=False)
    sns.histplot(data=results_pre[plot_data_name[i]], kde=True, stat='density')
    plt.ylim([0,0,0,0][i], [2300,1150,650,6.2][i])
    plt.xlim([0.0004,0.003,-0.0057,-0.51][i], [0.0021,0.008,-0.001,-0.165][i])
    plt.ylabel("Density")
    plt.xlabel("")

    # real subplot #2
    ax = fig.add_subplot(gs[1])
    ax.set_title(['COVID','COVID','COVID','COVID'][i])
    # hide ticks and labels
    ax.tick_params(left=False, labelleft=False, labelbottom=True, bottom=False)
    ax.scatter(results_post['Issue Date'], results_post[plot_data_name[i]], s =35, c = 'black')
    #ax.plot([min(results_post['Issue Date']), max(results_post['Issue Date'])], plot_lines[i])
    plt.axhspan(plot_lines[i][0], [0.0021,0.0068,-0.00045,-0.15][i], facecolor='0.2', alpha=0.2)
    plt.text(min(results_post['Issue Date']+datetime.timedelta(days=4)), [0.0019,0.0064,-0.0011,-0.19][i], '{0:.0%}'.format(percent_calc_post[i]) + ' of the Data', color="blue", fontsize=16)
    plt.ylim(min(results_post[plot_data_name[i]])-[0.0001,0.0002,0.0005,0.02][i], max(results_post[plot_data_name[i]])+[0.0002,0.0004,0.0003,0.04][i])


    ax = fig.add_subplot(gs[3])
    # hide ticks and labels
    ax.tick_params(left= False, labelleft=False, labelbottom=True, bottom=False)
    sns.histplot(data=results_post[plot_data_name[i]], kde=True, stat='density' )
    plt.ylim([0,0,0,0][i], [2300,1150,650,6.2][i])
    plt.xlim([0.0004,0.003,-0.0057,-0.51][i], [0.0021,0.008,-0.001,-0.165][i])
    plt.ylabel("")
    plt.xlabel("")


fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Overview of Wordcounts', fontweight='bold', size=16)
fig.tight_layout()
No description has been provided for this image
In [288]:
import seaborn as sns
In [289]:
plot2_axis_labels= [['LMD Constraining Ratio','Pandemic Word Ratio'],['LMD Uncertainty Ratio', 'Pandemic Word Ratio'],['LMD Sentiment Ratio','Pandemic Word Ratio']]
In [ ]:
 
In [290]:
rows = 3
cols = 1

fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)

for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #  '\n' is important
    fake.set_title(['Constraining Words:','Uncertain Words:','Sentiment:'][i], fontweight='bold', size=16, loc='left')
    fake.set_axis_off()

    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=grid[i], wspace=0.05)

    # real subplot #1
    ax = fig.add_subplot(gs[0])
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=True)
    ax.axis([min(results_pre['pandemic words']),max(results_pre['pandemic words']),min(results_pre[plot_data_name[i]]),max(results_pre[plot_data_name[i]])])
    sns.regplot(x='pandemic words',y=plot_data_name[i], data = results_pre, line_kws={'color': 'black'})
    plt.xlabel(plot2_axis_labels[i][1], size=12)
    plt.ylabel(plot2_axis_labels[i][0], size=12)



fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Overview Wordcounts', fontweight='bold', size=16)
fig.tight_layout()
No description has been provided for this image
In [292]:
from scipy import stats
Out[292]:
(0.3626434742394554, 3.2919953908863184e-08)
In [295]:
print(stats.pearsonr(np.array(NLP_results['pandemic words']),np.array(NLP_results['sentiment'])))
print(stats.pearsonr(np.array(NLP_results['pandemic words']),np.array(NLP_results['uncertainty'])))
print(stats.pearsonr(np.array(NLP_results['pandemic words']),np.array(NLP_results['constraining'])))
(0.3626434742394554, 3.2919953908863184e-08)
(0.3168112250248351, 1.7065657902013273e-06)
(-0.14130159240863932, 0.03665360879070792)
In [296]:
print(stats.spearmanr(np.array(NLP_results['pandemic words']),np.array(NLP_results['sentiment'])))
print(stats.spearmanr(np.array(NLP_results['pandemic words']),np.array(NLP_results['uncertainty'])))
print(stats.spearmanr(np.array(NLP_results['pandemic words']),np.array(NLP_results['constraining'])))
SpearmanrResult(correlation=0.3044901300427793, pvalue=4.438645404602572e-06)
SpearmanrResult(correlation=0.24170948441895618, pvalue=0.0003059827524162565)
SpearmanrResult(correlation=-0.11424135245163458, pvalue=0.09170563417495373)
In [301]:
f = plt.figure(figsize=(19, 15))
plt.matshow(corrMatrix.corr(), fignum=f.number)
plt.xticks(range(corrMatrix.select_dtypes(['number']).shape[1]), corrMatrix.select_dtypes(['number']).columns, fontsize=14, rotation=45)
plt.yticks(range(corrMatrix.select_dtypes(['number']).shape[1]), corrMatrix.select_dtypes(['number']).columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
No description has been provided for this image

FIN-BERT Model for Sentiment

In [310]:
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
In [311]:
sentences2 = []
for j in range(len(csv_save_processed['sentences'])):
    sentences2.append([i for (i,v) in zip(csv_save_processed['sentences'][j], [x.count(' ')>400 for x in csv_save_processed['sentences'][j]]) if not v])

csv_save_processed['sentences2'] = sentences2
In [312]:
senti = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

results = []

for i in  tqdm.tqdm(range(len(csv_save_processed['sentences2']))):
    
    results.append(senti(csv_save_processed['sentences2'][i])) #LABEL_0: neutral; LABEL_1: positive; LABEL_2: negative
       
100%|██████████| 219/219 [4:34:40<00:00, 75.25s/it]   
In [313]:
pos = []
neg = []
Finbert_res = pd.DataFrame(NLP_results[['Issuer/Borrower PermID','sentences']])

for i in range(len(results)):    
    pos.append(len([i for i in results[i] if i['label'] == 'positive']))

for j in range(len(results)):    
    neg.append(len([i for i in results[j] if i['label'] == 'negative']))


Finbert_res['finbert positive'] = pos
Finbert_res['finbert negative'] = neg
In [314]:
Finbert_res['finbert sentiment'] = (Finbert_res['finbert positive'] - Finbert_res['finbert negative']) / Finbert_res['sentences']
In [66]:
%store Finbert_res
Stored 'results' (list)
In [304]:
%store -r Finbert_res
In [315]:
NLP_results = NLP_results.merge(Finbert_res[['Issuer/Borrower PermID', 'finbert sentiment']], on='Issuer/Borrower PermID')
In [122]:
NLP_results = NLP_results.drop([12,217])
In [934]:
finbert_plotdata_names1 = ['constraining','uncertainty']
finbert_plotdata_names2 = ['pandemic words', 'sentiment']
In [935]:
rows = 2
cols = 1

fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)

for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #  '\n' is important
    #fake.set_title(finbert_plotdata_names, fontweight='semibold', size=14)
    fake.set_axis_off()

    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=grid[i], wspace=0.1)

    # real subplot #1
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Constraining Words:','Uncertainty Words:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['finbert sentiment']),max(results_post['finbert sentiment']),min(results_post[finbert_plotdata_names1[i]]),max(results_post[finbert_plotdata_names1[i]])])
    sns.regplot(x='finbert sentiment', y=finbert_plotdata_names1[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('Finbert Sentiment Ratio', size=12)
    plt.ylabel(['LMD Constraining Ratio','LMD Uncertainty Ratio'][i], size=12)
    

    # real subplot #2
    ax = fig.add_subplot(gs[1])
    ax.set_title(['Pandemic Words:','LMD Sentiment:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['finbert sentiment']),max(results_post['finbert sentiment']),min(results_post[finbert_plotdata_names2[i]]),max(results_post[finbert_plotdata_names2[i]])])
    sns.regplot(x='finbert sentiment',y=finbert_plotdata_names2[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('Finbert Sentiment Ratio', size=12)
    plt.ylabel(['Pandemic Words Ratio','LMD Sentiment'][i], size=12)


fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Comparision between Wordcounts and FinBert Sentiment', fontweight='bold', size=16)
fig.tight_layout()
No description has been provided for this image
In [936]:
finbert_plotdata_names1 = ['constraining','uncertainty']
finbert_plotdata_names2 = ['pandemic words', 'finbert sentiment']
In [939]:
rows = 2
cols = 1

fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)

for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #  '\n' is important
    #fake.set_title(finbert_plotdata_names, fontweight='semibold', size=14)
    fake.set_axis_off()

    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=grid[i], wspace=0.1)

    # real subplot #1
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Constraining Words:','Uncertainty Words:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['sentiment']),max(results_post['sentiment']),min(results_post[finbert_plotdata_names1[i]]),max(results_post[finbert_plotdata_names1[i]])])
    sns.regplot(x='sentiment', y=finbert_plotdata_names1[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('LMD Sentiment Ratio', size=12)
    plt.ylabel(['LMD Constraining Ratio','LMD Uncertainty Ratio'][i], size=12)
    

    # real subplot #2
    ax = fig.add_subplot(gs[1])
    ax.set_title(['Pandemic Words:','Finbert Sentiment:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['sentiment']),max(results_post['sentiment']),min(results_post[finbert_plotdata_names2[i]]),max(results_post[finbert_plotdata_names2[i]])])
    sns.regplot(x='sentiment',y=finbert_plotdata_names2[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('LMD Sentiment Ratio', size=12)
    plt.ylabel(['Pandemic Words Ratio','Finbert Sentiment'][i], size=12)
    

fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Comparision between Wordcounts and LMD Sentiment Sentiment', fontweight='bold', size=16)
fig.tight_layout()
No description has been provided for this image

Topic Model¶

In [14]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
In [15]:
Text_TM = csv_save_processed

Test if datatype is correct

In [16]:
#isinstance(item, float) for item in Text_TM['content']

#all(isinstance(item, str) for item in test)
test = []
test2 = []
for i in range(len(Text_TM)):
    test.append(type(Text_TM['content'][i]))

for i in range(len(test)):
    if test[i]!=str:
        test2.append(i)
    else:
        pass
In [17]:
test2
Out[17]:
[]
In [18]:
Text_TM[Text_TM['content'] == '']
Out[18]:
Issuer/Borrower PermID content sentences
12 4296140188 []
217 5081364438 []

Drop empty rows

In [19]:
Text_TM= Text_TM.drop([12,217])
In [20]:
Text_TM= Text_TM.reindex()
In [23]:
Text_TM.rename(columns={'filename':'Issuer/Borrower PermID'}, inplace=True)

Text_TM = Text_TM.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')

Text_TM['Issue Date'] = Text_TM['Issue Date'].apply(lambda x: x.date())
In [ ]:
split_date = pd.datetime(2020,1,1).date()

TM_pre = Text_TM.loc[Text_TM['Issue Date'] <= split_date]
TM_post = Text_TM.loc[Text_TM['Issue Date'] > split_date]

Further pre-processing of text

In [25]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '#!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@₹€$£—–'

# cleaning master function
def clean_Text_data_TopicM(Text_data_TopicM, bigrams=False):

    Text_data_TopicM = Text_data_TopicM.lower() # lower case
    Text_data_TopicM = re.sub('['+my_punctuation + ']+', ' ', Text_data_TopicM) # strip punctuation
    Text_data_TopicM = re.sub('([0-9]+)', '', Text_data_TopicM) # remove numbers#
    Text_data_TopicM = re.sub(r'\beuro\b|\beur\b|usd|rmb|rm|renminbi|cent|million|prc', '', Text_data_TopicM) #currencies and money related terms
    Text_data_TopicM = re.sub(r'hong|kong|hk|cayman|uk|indian|india|indic|\beu\b|china|germani|singapor|united|kingdom|\bus\b|china|chinese|european|mofcom', '', Text_data_TopicM) #countries or country institutions
    Text_data_TopicM = re.sub(r'\bi\b | \bii\b | \biii\b| \biv\b| \bv\b| \bvi\b| \bvii\b| \bx\b| \bxi\b', '', Text_data_TopicM) #bullets
    Text_data_TopicM = re.sub(r'prospectus|see|section| ordinary |\bmr\b |\bmrs\b|class|stock| shares', '', Text_data_TopicM) #frequently used words in prospectus, which could distort
    Text_data_TopicM = re.sub('\s+', ' ', Text_data_TopicM) #remove double spacing
    Text_data_TopicM_token_list = [word for word in Text_data_TopicM.split(' ')
                            if word not in my_stopwords] # remove stopwords

    Text_data_TopicM_token_list = [word_rooter(word) if '#' not in word else word
                        for word in Text_data_TopicM_token_list] # apply word rooter
    if bigrams:
        Text_data_TopicM_token_list = Text_data_TopicM_token_list+[Text_data_TopicM_token_list[i]+'_'+Text_data_TopicM_token_list[i+1]
                                            for i in range(len(Text_data_TopicM_token_list)-1)]
    Text_data_TopicM = ' '.join(Text_data_TopicM_token_list)
    return Text_data_TopicM
In [ ]:
TM_pre['content'] = TM_pre.content.apply(clean_Text_data_TopicM)

TM_post['content'] = TM_post.content.apply(clean_Text_data_TopicM)
In [27]:
#second cleaning step

def clean_Text_data_TopicM_2(Text_data_TopicM, bigrams=False):

    Text_data_TopicM = re.sub( r"\ber\b | \bu\b | \bha\b| \btrt\b | \bl\b | \be\b | \bb\b | \bc\b" , '', Text_data_TopicM) 

    return Text_data_TopicM
In [ ]:
TM_pre['content'] = TM_pre.content.apply(clean_Text_data_TopicM_2)

TM_post['content'] = TM_post.content.apply(clean_Text_data_TopicM_2)

Calculate Topic Model

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf_pre = vectorizer.fit_transform(TM_pre['content'] ).toarray()
tf_post = vectorizer.fit_transform(TM_post['content'] ).toarray()
# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names_out()
In [110]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 5
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
In [111]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)
In [112]:
model_pre = model.fit(tf_pre)
no_top_words = 25
topics_pre= display_topics(model_pre, tf_feature_names, no_top_words)
In [113]:
topics_pre
Out[113]:
Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights Topic 2 words Topic 2 weights Topic 3 words Topic 3 weights Topic 4 words Topic 4 weights
0 composit 1156.2 liabl 1550.5 composit 1355.4 rapid 1057.2 liabl 1919.0
1 attack 882.8 progress 663.5 placement 646.3 defer 861.4 length 1079.7
2 possess 487.6 qualiti 536.0 discov 568.3 defin 751.9 forecast 776.5
3 concern 475.8 januari 448.8 pipelin 558.9 billion 719.4 clean 556.0
4 go 456.1 length 406.5 dee 524.8 code 620.3 monetari 403.6
5 faith 356.8 oversea 379.4 possess 468.5 gdpr 545.1 go 392.6
6 destroy 334.1 cancel 378.7 opinion 394.5 monetari 512.2 attack 318.2
7 burden 312.7 defin 321.6 hack 372.9 intang 510.7 defer 315.9
8 lesser 265.3 magnitud 314.5 bankruptci 351.7 invalid 509.9 perag 312.1
9 interact 249.3 accru 279.6 clearanc 340.6 liabl 501.5 amongst 301.6
10 memorandum 242.3 misappropri 278.6 loyalti 304.2 familiar 473.6 privaci 298.8
11 institut 232.7 equal 277.9 deliveri 303.6 perag 435.7 emptiv 276.2
12 expert 226.0 depreci 277.7 passiv 296.7 concern 413.8 composit 272.6
13 confi 223.1 confi 257.6 destroy 291.9 citizen 410.1 magnitud 265.5
14 impair 218.5 pool 245.7 negoti 281.0 cancel 407.1 principl 259.0
15 geani 213.8 fraud 223.5 mostli 271.4 equal 379.1 depreci 256.8
16 instanc 213.7 movement 212.1 confi 260.9 indirectli 374.1 equival 242.9
17 leakag 209.5 re 209.9 join 257.9 late 372.7 ad 238.2
18 outbreak 208.9 interact 196.9 intensifi 238.9 possess 361.3 contrari 214.7
19 insuffici 199.8 devic 194.5 regist 235.1 broad 354.6 regist 213.9
20 asia 195.4 deliveri 170.7 familiar 232.9 necessarili 348.7 broad 210.7
21 headcount 195.0 choos 161.7 efficaci 232.8 administr 331.0 intensifi 205.9
22 divert 193.3 award 156.0 perag 220.9 ad 318.6 mechan 205.8
23 notwithstand 190.2 contrari 139.1 fraud 218.5 dissemin 288.1 fraud 204.3
24 hn 189.0 coverag 137.9 liabl 212.5 featur 280.3 fiscal 197.1
In [114]:
model_post = model.fit(tf_post)
topics_post = display_topics(model_post, tf_feature_names, no_top_words)
In [115]:
topics_post
Out[115]:
Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights Topic 2 words Topic 2 weights Topic 3 words Topic 3 weights Topic 4 words Topic 4 weights
0 custom 2270.5 properti 3941.0 candid 3622.5 project 928.1 combin 3391.6
1 bank 1530.3 custom 3048.6 patent 3032.9 portfolio 715.4 warrant 1888.1
2 partner 1013.8 enterpris 1241.8 clinic 2832.2 infrastructur 525.6 target 1695.9
3 manufactur 886.3 data 1112.1 trial 2120.3 energi 504.4 sponsor 1019.0
4 credit 638.8 reput 1100.6 manufactur 1380.7 net 498.5 holder 738.9
5 supplier 624.6 brand 1067.2 licens 1261.0 issuer 478.3 entiti 571.6
6 facil 618.8 resid 973.7 intellectu 1210.7 theshar 461.5 redempt 521.1
7 suppli 603.3 consum 952.7 properti 1171.9 power 439.3 vote 496.1
8 pandem 573.5 intellectu 839.8 data 971.7 properti 427.7 per 439.9
9 flow 569.0 platfo 835.5 collabor 886.0 acquir 416.1 acquir 418.0
10 disrupt 555.7 supplier 754.2 patient 866.8 renew 392.6 unit 404.4
11 reput 447.2 record 750.0 enterpris 816.0 construct 331.0 consumm 388.5
12 action 411.1 subsidiari 743.7 enforc 715.7 note 308.5 privat 369.5
13 end 407.1 administr 739.8 medic 713.3 land 273.4 opportun 361.9
14 data 401.5 qualiti 721.8 research 694.0 trust 257.5 affili 360.9
15 loan 397.3 enforc 697.7 test 603.8 facil 256.5 founder 348.3
16 fiscal 361.0 entiti 677.4 administr 562.7 advis 251.7 trust 330.7
17 capac 342.2 fee 647.4 program 537.3 electr 242.8 member 326.7
18 litig 334.8 contractu 607.8 safe 528.6 flow 241.5 offic 314.7
19 raw 325.0 social 607.5 resid 466.8 realis 241.4 amend 314.6
20 plant 314.1 expand 602.4 effort 439.5 effici 226.2 redeem 312.7
21 economi 311.6 decemb 591.1 court 434.5 target 223.5 conflict 263.7
22 theshar 304.9 action 567.4 subsidiari 425.5 borrow 221.8 theshar 256.6
23 proceed 300.2 leas 558.5 litig 424.3 counterparti 221.8 propos 251.8
24 march 297.7 sourc 556.7 circular 423.3 transmiss 217.2 proce 250.6

Model Evaluation

In [546]:
from sklearn.model_selection import GridSearchCV
In [547]:
# Define Search Param
search_params = {'n_components': [1 , 3, 4 , 5 , 6 , 8 , 10], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(tf_pre)
Out[547]:
GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [1, 3, 4, 5, 6, 8, 10]})
In [ ]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(tf_post))
In [549]:
result = pd.DataFrame(model.cv_results_)
In [552]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [1 , 3, 4 , 5 , 6 , 8 , 10]
log_likelyhoods_5 = [round(result['mean_test_score'][index]) for index in range(len(result['params'])) if result['params'][index]['learning_decay']==0.5]
log_likelyhoods_7 = [round(result['mean_test_score'][index]) for index in range(len(result['params'])) if result['params'][index]['learning_decay']==0.7]
log_likelyhoods_9 = [round(result['mean_test_score'][index]) for index in range(len(result['params'])) if result['params'][index]['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(16, 6), facecolor='lightgrey')
plt.plot(n_topics, log_likelyhoods_5, label='0.5', color= 'blue')
plt.plot(n_topics, log_likelyhoods_7, label='0.7', color= 'dodgerblue')
plt.plot(n_topics, log_likelyhoods_9, label='0.9', color= 'turquoise')
#plt.vlines(x=4, ymin=-618000, ymax=-604100, colors='lightgrey', ls='--', lw=2)
plt.title("Choosing Optimal LDA Model", size=16)
plt.xlabel("Number of Topics", size = 12)
plt.ylabel("Log Likelyhood Scores", size = 12)
plt.legend(title='Learning decay', loc='best', fontsize=12, title_fontsize=12)
plt.show()
No description has been provided for this image
In [73]:
import pyLDAvis
import pyLDAvis.sklearn
In [ ]:
pyLDAvis.enable_notebook()
test = np.matrix(tf_post)
panel = pyLDAvis.sklearn.prepare(best_lda_model, test, vectorizer, mds='tsne')
panel
In [86]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
In [87]:
lda_output = best_lda_model.transform(tf_post)

# Construct the k-means clusters
clusters = KMeans(n_clusters=15, random_state=100).fit_predict(lda_output)

# Build the Singular Value Decomposition(SVD) model
svd_model = TruncatedSVD(n_components=2)  # 2 components
lda_output_svd = svd_model.fit_transform(lda_output)

# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]

# Weights for the 15 columns of lda_output, for each component
print("Component's weights: \n", np.round(svd_model.components_, 2))

# Percentage of total information in 'lda_output' explained by the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))
Component's weights: 
 [[ 0.32  0.07  0.94  0.05]
 [ 0.93  0.02 -0.33  0.17]]
Perc of Variance Explained: 
 [0.23 0.36]
In [89]:
# Plot
plt.figure(figsize=(5, 5))
plt.scatter(x, y, c=clusters)
plt.xlabel('Component 2')
plt.xlabel('Component 1')
plt.title("Segregation of Topic Clusters", )
Out[89]:
Text(0.5, 1.0, 'Segregation of Topic Clusters')
No description has been provided for this image

Print word cloud¶

In [554]:
from wordcloud import WordCloud
In [ ]:
test=display_topics(model, tf_feature_names, no_top_words)
In [521]:
test['Topic 1 weights'] = [float(x) for x in test['Topic 1 weights']]
In [522]:
tuples = [tuple(x) for x in test[['Topic 1 words', 'Topic 1 weights']].values]
In [556]:
def black_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    return("hsl(0,100%, 1%)")
In [558]:
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black')
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(tuples))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
No description has been provided for this image

Analyze the counted LMD words¶

In [480]:
senti_res= pd.DataFrame(senti_res, columns = ['words','positive','negative', 'positive_words', 'negative_words'])
uncert_const_res = pd.DataFrame(uncert_const_res, columns = ['words','uncertain','constraining', 'uncertain_words', 'constraining_words'])
In [517]:
senti_res['Issuer/Borrower PermID'] = csv_save_processed['Issuer/Borrower PermID'].drop([12,217])
uncert_const_res['Issuer/Borrower PermID'] = csv_save_processed['Issuer/Borrower PermID'].drop([12,217])
In [518]:
senti_res = senti_res.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
uncert_const_res = uncert_const_res.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
In [523]:
senti_res['Issue Date'] = senti_res['Issue Date'].apply(lambda x: x.date())
uncert_const_res['Issue Date'] = uncert_const_res['Issue Date'].apply(lambda x: x.date())
In [ ]:
split_date = pd.datetime(2020,1,1).date()

senti_res_pre = senti_res.loc[senti_res['Issue Date'] <= split_date]
senti_res_post = senti_res.loc[senti_res['Issue Date'] > split_date]

uncert_const_res_pre = uncert_const_res.loc[uncert_const_res['Issue Date'] <= split_date]
uncert_const_res_post = uncert_const_res.loc[uncert_const_res['Issue Date'] > split_date]
In [704]:
import itertools

senti_words_pre = pd.DataFrame(itertools.chain.from_iterable(senti_res_pre['positive_words']), columns = ['positive_words'])
senti_words_pre['negative_words'] = pd.DataFrame(itertools.chain.from_iterable(senti_res_pre['negative_words']))
other_words_pre = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_pre['uncertain_words']), columns = ['uncertain_words'])
other_words_pre['constraining_words'] = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_pre['constraining_words']))

senti_words_post = pd.DataFrame(itertools.chain.from_iterable(senti_res_post['positive_words']), columns = ['positive_words'])
senti_words_post['negative_words'] = pd.DataFrame(itertools.chain.from_iterable(senti_res_post['negative_words']))
other_words_post = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_post['uncertain_words']), columns = ['uncertain_words'])
other_words_post['constraining_words'] = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_post['constraining_words']))
In [717]:
other_words_pre['constraining_words']= other_words_post['constraining_words'].replace(np.nan,"")
other_words_post['constraining_words']= other_words_post['constraining_words'].replace(np.nan,"")
In [718]:
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem

senti_words_pre = senti_words_pre.apply(lambda x: [word_rooter(word) if '#' not in word else word for word in x ])
senti_words_post = senti_words_pre.apply(lambda x: [word_rooter(word) if '#' not in word else word for word in x ])
other_words_pre = other_words_pre.apply(lambda x: [word_rooter(word) if '#' not in word else word for word in x ])
other_words_post = other_words_post.apply(lambda x: [word_rooter(word) if '#' not in word else word for word in x ])
In [819]:
pos_words_freq_pre = senti_words_pre['positive_words'].value_counts()
neg_words_freq_pre = senti_words_pre['negative_words'].value_counts()
uncert_words_freq_pre = other_words_pre['uncertain_words'].value_counts()
constr_words_freq_pre = other_words_pre['constraining_words'].value_counts()

pos_words_freq_post = senti_words_post['positive_words'].value_counts()
neg_words_freq_post = senti_words_post['negative_words'].value_counts()
uncert_words_freq_post = other_words_post['uncertain_words'].value_counts()
constr_words_freq_post = other_words_post['constraining_words'].value_counts()
In [820]:
uncert_words_freq_post = uncert_words_freq_post.drop(labels=['may','could','risk','might','uncertainti','depend','possibl','fluctuat'])
In [821]:
uncert_words_freq_pre = uncert_words_freq_pre.drop(labels=['may','could','risk','might','uncertainti','depend','possibl','fluctuat'])
In [822]:
neg_words_freq_pre = neg_words_freq_pre.drop(labels=['adver','failur','loss','delay','termin','unabl'])
neg_words_freq_post = neg_words_freq_post.drop(labels=['adver','failur','loss','delay','termin','unabl'])
In [823]:
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black', dpi = 400)
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(neg_words_freq_post))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
No description has been provided for this image
In [824]:
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black', dpi = 400)
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(neg_words_freq_pre))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
No description has been provided for this image

Analyze Text Similarity (NOT IN PAPER)¶

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
LemVectorizer = CountVectorizer(tokenizer=tokenizer, stop_words='english')
LemVectorizer.fit_transform(Text_TM['content'])
In [303]:
LemVectorizer.vocabulary_
Out[303]:
{'input_ids': 1, 'token_type_ids': 2, 'attention_mask': 0}
In [297]:
type(Text_TM['content'])
Out[297]:
pandas.core.series.Series
In [2]:
!jupyter nbconvert --to html NLP_Part_Risk_Factors.ipynb